# This code extracts all necessary variables from original datasets
## This R code was written by Eric Guntermann. Questions and/or comments can be sent to eric.guntermann@umontreal.ca
## CSES modules 1 to 3 were downloaded on 09/27/2013 from http://www.cses.org/. CSES module 4 was downloaded on 04/11/2015
## Note: to run this file in R, you must set the directory containing it as your working directory.

# Set working directory
# The path below only exists on the original author's machine. It is applied
# only when the directory is present; otherwise the current working directory
# is kept, which (per the note at the top of this file) must be the directory
# containing this script and the data files.
local({
  replication.dir <- "/Users/ericguntermann/Documents/Papers/Representation of Party Preferences/Replication"
  if (dir.exists(replication.dir)) {
    setwd(replication.dir)
  } else {
    message("Replication directory not found; using current working directory: ", getwd())
  }
})

library(foreign) # Used for some datasets

## CSES1

## Load dataset (Stata file read with foreign::read.dta)
cses1 <- read.dta(file = "cses1.dta")

## Apply corrections (from CSES site)
# (1) Great Britain (1997) - macro variables A5005_A-A5005_E, A5006_A-A5006_E,
#     A5010, A5026_1, and A5027_1.
# Each entry below is written to the variable of the same name on every row
# whose A1004 equals "GBR_1997"; all other rows are left untouched.
cses1 <- local({
  gbr1997.values <- c(
    A5005_A = 44.33,
    A5005_B = 31.45,
    A5005_C = 17.19,
    A5005_D = 2.04,
    A5005_E = 0.53,
    A5006_A = 65.21,
    A5006_B = 25.74,
    A5006_C = 7.18,
    A5006_D = 0.94,
    A5006_E = 0.31,
    A5010   = 71.57,
    A5026_1 = 641,
    A5027_1 = 641
  )
  corrected <- cses1
  is.gbr1997 <- corrected$A1004 == "GBR_1997"
  for (macro.var in names(gbr1997.values)) {
    corrected[[macro.var]][is.gbr1997] <- gbr1997.values[[macro.var]]
  }
  corrected
})

## Create identification variables: numeric country code (country.num),
## English country name (country), four-digit year (year), respondent
## identifier (respondent), and country-year (countryyear, "<country> <year>").
cses1.id <- local({
  # Country names are assigned to the factor levels by position, so the order
  # of this vector must match the order of the levels of the country code.
  country.names <- c("Australia", "Belgium_Flanders", "Belgium-Walloon", "Belarus", "Canada", "Chile", "Taiwan", "Czech Republic", "Denmark", "Germany", "Hong Kong", "Hungary", "Iceland", "Israel", "Japan", "Korea", "Lithuania", "Mexico", "Netherlands", "New Zealand", "Norway",
                     "Peru", "Poland", "Portugal", "Romania", "Russia", "Slovenia", "Spain", "Sweden", "Switzerland", "Thailand", "Ukraine", "Great Britain", "United States")

  # Columns 6, 8 and 9 of cses1 are referenced below as A1006, A1008, A1009
  ident <- cses1[, c(6, 8, 9)]

  country.factor <- droplevels(as.factor(ident$A1006))
  levels(country.factor) <- country.names

  ident$country.num <- ident$A1006
  ident$country <- as.character(country.factor)
  ident$year <- as.character(ident$A1008)
  ident$respondent <- as.character(ident$A1009)
  ident$countryyear <- paste(ident$country, ident$year, sep = " ")

  # Keep only the derived columns, dropping the three raw CSES columns
  ident[, c("country.num", "country", "year", "respondent", "countryyear")]
})

## Generate variable for type of election (CSES variable A1015, unmodified)
cses1.type <- cses1$A1015

# Party like-dislike scales. These are taken from CSES variables A3020_A, A3020_B, ... A3020_I
cses1.vars <- local({
  scales <- cses1[, 98:106]

  # Codes 96, 97, 98 and 99 are treated as missing on every scale
  for (scale.var in paste0("A3020_", LETTERS[1:9])) {
    scales[[scale.var]][scales[[scale.var]] %in% c(96, 97, 98, 99)] <- NA
  }

  names(scales) <- paste0("likedislike", 1:9)

  # Convert every column to numeric and subtract 1.
  # NOTE(review): presumably the columns are factors, so as.numeric() returns
  # level codes starting at 1 and the subtraction restores a 0-based scale -
  # confirm against the loaded data.
  scales[] <- lapply(scales, function(scale.col) as.numeric(scale.col) - 1)
  scales
})

## Individual-level data on political information, education, and income
## (plus turnout), taken from A2003, A2012, A2023, A2024, A2025 and A2028.
cses1.ind <- local({
  ind <- cses1[, c("A2003", "A2012", "A2023", "A2024", "A2025", "A2028")]
  names(ind) <- c("education", "income", "polinfo1", "polinfo2", "polinfo3", "voted")

  # Missing-value codes: 8 and 9 for income, 9 for education and the
  # political information items
  ind$income[ind$income %in% c(8, 9)] <- NA
  for (item in c("education", "polinfo1", "polinfo2", "polinfo3")) {
    ind[[item]][ind[[item]] %in% 9] <- NA
  }

  # Turnout: numeric values above 5 or equal to 0 are missing; after that,
  # a numeric value of 1 is coded 1 and every other non-missing value 0
  voted.code <- as.numeric(ind$voted)
  ind$voted[!is.na(voted.code) & (voted.code > 5 | voted.code == 0)] <- NA
  ind$voted <- ifelse(as.numeric(ind$voted) == 1, 1, 0)

  # Convert all to numeric
  ind[] <- lapply(ind, as.numeric)
  ind
})
## Gallagher disproportionality index using the vote percentage of each party in the lower house (variables A5005_A to A5005_F in CSES) and their seat percentage (variables A5006_A to A5006_F in CSES)
# Note Gallagher index is calculated for up to six parties
# Resulting column layout of cses1.disprop:
#   1-6   votes.party1 ... votes.party6
#   7-12  seats.party1 ... seats.party6
#   13-18 squared (votes - seats) difference per party
#   then  sumdiff (row sum of 13-18, NAs ignored) and gallagher = sqrt(sumdiff / 2)
cses1.disprop <- local({
  shares <- cses1[, 182:193]

  # Replace each raw CSES column by a renamed copy with 997/998/999 set to NA
  for (series in list(c("votes", "A5005_"), c("seats", "A5006_"))) {
    for (party in 1:6) {
      raw.name <- paste0(series[2], LETTERS[party])
      share <- cses1[[raw.name]]
      share[share %in% c(997, 998, 999)] <- NA
      shares[[paste0(series[1], ".party", party)]] <- share
      shares[[raw.name]] <- NULL
    }
  }

  # Squared vote-seat difference for each party (columns 13 to 18)
  for (party in 1:6) {
    shares[, party + 12] <- (shares[, party] - shares[, party + 6])^2
  }

  # Parties with a missing difference contribute nothing to the sum
  shares$sumdiff <- rowSums(shares[, 13:18], na.rm = TRUE)
  shares$gallagher <- sqrt(shares$sumdiff / 2)
  shares
})
cses1.gallagher <- cses1.disprop$gallagher

## Calculate mean district magnitude
# mdm = (A5027_1 + A5027_2) /
#       (A5026_1 + A5026_2 + A5029_1 + A5029_2 + A5030_1 + A5030_2)
# Values of 999 and 0 are treated as missing, and missing values are ignored
# in both sums. A row whose denominator columns are all missing therefore
# divides by zero and yields Inf or NaN.
# The original code summed only columns 3:7 in the denominator, silently
# leaving out A5030_2 even though it is selected and cleaned here; the CSES 2
# block sums all six denominator columns, so the same is done here.
# The row-by-row loop is replaced by vectorised row sums (same arithmetic).
cses1.mdm <- local({
  mdm.vars <- c("A5027_1", "A5027_2", "A5026_1", "A5026_2", "A5029_1", "A5029_2", "A5030_1", "A5030_2")
  segments <- cses1[, mdm.vars]

  for (mdm.var in mdm.vars) {
    segments[[mdm.var]][segments[[mdm.var]] %in% c(0, 999)] <- NA
  }

  numerator <- unname(rowSums(segments[, 1:2], na.rm = TRUE))
  denominator <- unname(rowSums(segments[, 3:8], na.rm = TRUE))
  segments$mdm <- numerator / denominator
  segments
})

## Get weights (A1010_1, A1010_2, A1010_3 renamed to sample, demographic and
## political weight)
cses1.weights <- cses1[, c("A1010_1", "A1010_2", "A1010_3")]
names(cses1.weights) <- c("sampleweight", "demoweight", "polweight")

# Adjust German and Dutch weights (Germany 1998, Netherlands 1998).
# Demographic weights are included in political weights, so the demographic
# weight is set to 1 for both studies.
cses1.weights$demoweight[cses1.id$countryyear %in% c("Germany 1998", "Netherlands 1998")] <- 1

# Party voted for (A2030 -> vote1, A2031 -> vote2); codes 0, 98 and 99 are
# set to missing in both columns
cses1.vote <- cses1[, c("A2030", "A2031")]
names(cses1.vote) <- c("vote1", "vote2")
cses1.vote[] <- lapply(cses1.vote, function(choice) {
  choice[choice %in% c(0, 98, 99)] <- NA
  choice
})

## CSES 2

# Load the CSES module 2 raw data (comma-separated text with a header row)
cses2 <- read.csv(file = "cses2_rawdata.txt", header = TRUE)

## Remove German mail-back survey (rows whose B1003 equals 27622002)
cses2 <- cses2[!(cses2$B1003 == 27622002), ]

## Apply corrections to Italy 2006 data
# The corrections live in a separate script that is evaluated here.
# NOTE(review): presumably it modifies cses2 in place - confirm in italycorrection.R
source(file = "italycorrection.R")

## Create identification variables: numeric country code (country.num),
## English country name (country), four-digit year (year), respondent
## identifier (respondent), and country-year (countryyear, "<country> <year>").
cses2.id <- local({
  # Country names are assigned to the factor levels by position, so the order
  # of this vector must match the order of the levels of the country code.
  country.names <- c("Albania", "Australia", "Belgium", "Brazil", "Bulgaria", "Canada", "Chile", "Taiwan", "Czech Republic", "Denmark", "Finland", "France", "Germany", "Hong Kong", "Hungary", "Iceland", "Ireland", "Israel", "Italy", "Japan", "Korea", "Kyrgyzstan", "Mexico", "Netherlands", "New Zealand", "Norway",
                     "Peru", "Philippines", "Poland", "Portugal", "Romania", "Russia", "Slovenia", "Spain", "Sweden", "Switzerland", "Great Britain", "United States")

  # Columns 6, 8 and 9 of cses2 are referenced below as B1006, B1008, B1009
  ident <- cses2[, c(6, 8, 9)]

  country.factor <- as.factor(ident$B1006)
  levels(country.factor) <- country.names

  ident$country.num <- ident$B1006
  ident$country <- as.character(country.factor)
  ident$year <- as.character(ident$B1008)
  ident$respondent <- as.character(ident$B1009)
  ident$countryyear <- paste(ident$country, ident$year, sep = " ")

  # Keep only the derived columns, dropping the three raw CSES columns
  ident[, c("country.num", "country", "year", "respondent", "countryyear")]
})

## Generate variable for type of election (CSES variable B1015, unmodified)
cses2.type <- cses2$B1015

## Party like-dislike scales. These are taken from CSES variables B3037_A, B3037_B, ... B3037_I
cses2.vars <- local({
  scales <- cses2[, 120:128]

  # Codes 96, 97, 98 and 99 are treated as missing on every scale
  for (scale.var in paste0("B3037_", LETTERS[1:9])) {
    scales[[scale.var]][scales[[scale.var]] %in% c(96, 97, 98, 99)] <- NA
  }

  names(scales) <- paste0("likedislike", 1:9)
  scales
})

## Individual-level data on political information, education, and income
## (plus turnout), taken from B2003, B2020, B3047_1-B3047_3 and B3004_1.
cses2.ind <- local({
  ind <- cses2[, c("B2003", "B2020", "B3047_1", "B3047_2", "B3047_3", "B3004_1")]
  names(ind) <- c("education", "income", "polinfo1", "polinfo2", "polinfo3", "voted")

  # Any value above the listed maximum is set to missing
  highest.valid <- c(income = 5, education = 8, polinfo1 = 8, polinfo2 = 8, polinfo3 = 8, voted = 2)
  for (item in names(highest.valid)) {
    above.max <- ind[[item]] > highest.valid[[item]]
    ind[[item]][above.max] <- NA
  }

  # Turnout: 1 stays 1, any other non-missing value becomes 0
  ind$voted <- ifelse(ind$voted == 1, 1, 0)
  ind
})


## Gallagher disproportionality index using the vote percentage of each party in the lower house (variables B5001_A to B5001_I in CSES) and their seat percentage (variables B5002_A to B5002_I in CSES)
# Resulting column layout of cses2.disprop:
#   1-9   votes.party1 ... votes.party9
#   10-18 seats.party1 ... seats.party9
#   19-27 squared (votes - seats) difference per party
#   then  sumdiff (row sum of 19-27, NAs ignored) and gallagher = sqrt(sumdiff / 2)
cses2.disprop <- local({
  shares <- cses2[, 185:202]

  # Replace each raw CSES column by a renamed copy with 996-999 set to NA
  for (series in list(c("votes", "B5001_"), c("seats", "B5002_"))) {
    for (party in 1:9) {
      raw.name <- paste0(series[2], LETTERS[party])
      share <- cses2[[raw.name]]
      share[share %in% c(996, 997, 998, 999)] <- NA
      shares[[paste0(series[1], ".party", party)]] <- share
      shares[[raw.name]] <- NULL
    }
  }

  # Squared vote-seat difference for each party (columns 19 to 27)
  for (party in 1:9) {
    shares[, party + 18] <- (shares[, party] - shares[, party + 9])^2
  }

  # Parties with a missing difference contribute nothing to the sum
  shares$sumdiff <- rowSums(shares[, 19:27], na.rm = TRUE)
  shares$gallagher <- sqrt(shares$sumdiff / 2)
  shares
})
cses2.gallagher <- cses2.disprop$gallagher

## Calculate mean district magnitude
# Note that up to two electoral segments are included in this calculation
# mdm = (B5033_1 + B5033_2) /
#       (B5032_1 + B5032_2 + B5035_1 + B5035_2 + B5036_1 + B5036_2)
# Values of 999 and 0 are treated as missing, and missing values are ignored
# in both sums. A row whose denominator columns are all missing therefore
# divides by zero and yields Inf or NaN.
# Computed with vectorised row sums instead of a row-by-row loop: the loop
# rewrote the whole data frame once per respondent, which is needlessly slow,
# while the arithmetic is the same.
cses2.mdm <- local({
  mdm.vars <- c("B5033_1", "B5033_2", "B5032_1", "B5032_2", "B5035_1", "B5035_2", "B5036_1", "B5036_2")
  segments <- cses2[, mdm.vars]

  for (mdm.var in mdm.vars) {
    segments[[mdm.var]][segments[[mdm.var]] %in% c(0, 999)] <- NA
  }

  numerator <- unname(rowSums(segments[, 1:2], na.rm = TRUE))
  denominator <- unname(rowSums(segments[, 3:8], na.rm = TRUE))
  segments$mdm <- numerator / denominator
  segments
})

## Get weights (B1010_1, B1010_2, B1010_3 renamed to sample, demographic and
## political weight)
cses2.weights <- cses2[, c("B1010_1", "B1010_2", "B1010_3")]
names(cses2.weights) <- c("sampleweight", "demoweight", "polweight")

# Demographic weights are set to 1 for the following countries:
#   Belgium       - political weights take into account demographics
#   Germany       - sample and demographic weights are duplicates
#   Great Britain - sample and demographic weights are duplicates
#   Ireland       - sample and demographic weights are duplicates
#   Netherlands   - political weights include demographic weights
#   New Zealand   - political weights include sample and demographic weights
#   Poland        - sample and demographic weights are duplicates
cses2.weights$demoweight[cses2.id$country %in% c("Belgium", "Germany", "Great Britain", "Ireland", "Netherlands", "New Zealand", "Poland")] <- 1

# Ireland: sample weights are additionally divided by 1000
cses2.weights$sampleweight[cses2.id$country == "Ireland"] <- cses2.weights$sampleweight[cses2.id$country == "Ireland"] / 1000

# New Zealand: sample weights are also set to 1 (see note above)
cses2.weights$sampleweight[cses2.id$country == "New Zealand"] <- 1

# Adjust Portugal 2005 weights. There is lots of missing data, so both the
# demographic and the political weights are set to 1.
cses2.weights[cses2.id$countryyear == "Portugal 2005", c("demoweight", "polweight")] <- 1

# Party voted for (B3006_1 -> vote1, B3006_2 -> vote2); codes of 90 and above
# are set to missing in both columns
cses2.vote <- cses2[, c("B3006_1", "B3006_2")]
names(cses2.vote) <- c("vote1", "vote2")
cses2.vote[] <- lapply(cses2.vote, function(choice) {
  choice[choice >= 90] <- NA
  choice
})

## CSES 3

## Create identification variables: numeric country code (country.num),
## English country name (country), four-digit year (year), respondent
## identifier (respondent), and country-year (countryyear, "<country> <year>").
## The fields are read from fixed-width positions of cses3.dat: 64 skipped
## characters, 4 for the country code, 1 skipped, 4 for the year and 10 for
## the respondent identifier.
cses3.id <- local({
  # Country names are assigned to the factor levels by position, so the order
  # of this vector must match the order of the levels of the country code.
  country.names <- c("Australia", "Austria", "Brazil", "Belarus", "Canada", "Chile", "Taiwan", "Croatia", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hong Kong", "Iceland", "Ireland", "Israel", "Japan", "Korea", "Latvia", "Mexico", "Netherlands", "New Zealand",
                     "Norway", "Peru", "Philippines", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "South Africa", "Spain", "Sweden", "Switzerland", "Thailand", "Turkey", "United States", "Uruguay")

  fields <- read.fwf("cses3.dat",
                     widths = c(64, 4, 1, 4, 10),
                     col.names = c("NULLVAR", "country.num", "nullvar2", "year", "respondent"))
  # Drop the two filler columns
  fields <- fields[, c(2, 4, 5)]

  country.factor <- as.factor(fields$country.num)
  levels(country.factor) <- country.names
  country <- as.character(country.factor)
  year <- as.character(fields$year)

  data.frame(country.num = fields$country.num,
             country = country,
             year = year,
             respondent = fields$respondent,
             countryyear = paste(country, year, sep = " "))
})

## Party like-dislike scales. These are taken from CSES variables C3009_A, C3009_B, ... C3009_I
## Read from cses3.dat as nine 2-character fields after 291 skipped characters.
cses3.vars <- local({
  scale.vars <- paste0("C3009_", LETTERS[1:9])
  scales <- read.fwf("cses3.dat",
                     widths = c(291, rep(2, 9)),
                     col.names = c("VAR.NULL", scale.vars))
  # Drop the leading filler column
  scales <- scales[, -1]

  # Codes 96, 97, 98 and 99 are treated as missing on every scale
  for (scale.var in scale.vars) {
    scales[[scale.var]][scales[[scale.var]] %in% c(96, 97, 98, 99)] <- NA
  }

  names(scales) <- paste0("likedislike", 1:9)
  scales
})

## Individual-level data on political information, education, and income
## Read from fixed-width positions of cses3.dat; the columns named "VAR.NULL"
## and "NULL" are fillers that are dropped immediately after reading.
cses3.ind <- local({
  ind <- read.fwf("cses3.dat",
                  widths = c(211, 2, 20, 1, 294, 1, 1, 1),
                  col.names = c("VAR.NULL", "education", "NULL", "income", "NULL", "polinfo1", "polinfo2", "polinfo3"))
  # Keep education, income and the three political information items
  ind <- ind[, c(2, 4, 6, 7, 8)]

  # Income above 5 and education above 8 are missing
  ind$income[ind$income > 5] <- NA
  ind$education[ind$education > 8] <- NA

  # Code 9 is missing on the political information items
  for (item in c("polinfo1", "polinfo2", "polinfo3")) {
    ind[[item]][ind[[item]] %in% 9] <- NA
  }
  ind
})

## Gallagher disproportionality index using the vote percentage of each party in the lower house (variables C5001_A to C5001_I in CSES) and their seat percentage (variables C5002_A to C5002_I in CSES)

cses3.disprop <- read.fwf("cses3.dat",col.names=c("NULLVAR", "votes.party1", "votes.party2", "votes.party3", "votes.party4", "votes.party5", "votes.party6", "votes.party7", "votes.party8", "votes.party9", "seats.party1", "seats.party2", "seats.party3", "seats.party4", "seats.party5", "seats.party6", "seats.party7", "seats.party8", "seats.party9"),
widths=c(591,5,5,5,5,5,5,4,4,4,5,5,5,5,5,4,4,4,4))
cses3.disprop <- cses3.disprop[,2:19]
cses3.disprop$votes.party1[cses3.disprop$votes.party1==997] <- NA
cses3.disprop$votes.party1[cses3.disprop$votes.party1==998] <- NA
cses3.disprop$votes.party1[cses3.disprop$votes.party1==999] <- NA

cses3.disprop$votes.party2[cses3.disprop$votes.party2==997] <- NA
cses3.disprop$votes.party2[cses3.disprop$votes.party2==998] <- NA
cses3.disprop$votes.party2[cses3.disprop$votes.party2==999] <- NA

cses3.disprop$votes.party3[cses3.disprop$votes.party3==997] <- NA
cses3.disprop$votes.party3[cses3.disprop$votes.party3==998] <- NA
cses3.disprop$votes.party3[cses3.disprop$votes.party3==999] <- NA

cses3.disprop$votes.party4[cses3.disprop$votes.party4==997] <- NA
cses3.disprop$votes.party4[cses3.disprop$votes.party4==998] <- NA
cses3.disprop$votes.party4[cses3.disprop$votes.party4==999] <- NA

cses3.disprop$votes.party5[cses3.disprop$votes.party5==997] <- NA
cses3.disprop$votes.party5[cses3.disprop$votes.party5==998] <- NA
cses3.disprop$votes.party5[cses3.disprop$votes.party5==999] <- NA

cses3.disprop$votes.party6[cses3.disprop$votes.party6==997] <- NA
cses3.disprop$votes.party6[cses3.disprop$votes.party6==998] <- NA
## Recode the remaining vote- and seat-share variables of CSES3: the codes 997, 998 and 999 are set to NA
cses3.disprop$votes.party6[cses3.disprop$votes.party6 == 999] <- NA

for (share.var in c(paste0("votes.party", 7:9), paste0("seats.party", 1:9))) {
  for (na.code in c(997, 998, 999)) {
    cses3.disprop[[share.var]][cses3.disprop[[share.var]] == na.code] <- NA
  }
}

## Gallagher (least-squares) disproportionality index: sqrt(0.5 * sum over parties of (vote share - seat share)^2)
## Column i of cses3.disprop is paired with column i + 9 and the squared difference is stored in column i + 18.
## This assumes the vote shares of parties 1 to 9 are in columns 1 to 9 and the seat shares in columns 10 to 18
## (columns 10 to 18 are what is later carried over as seats.party1 to seats.party9).
for (i in 1:9) {
  cses3.disprop[, i + 18] <- (cses3.disprop[, i] - cses3.disprop[, i + 9])^2
}

## Sum the squared differences only (columns 19 to 27). The range previously started at column 18, which added
## the raw seat share of party 9 to the sum whenever that share was not missing.
## Parties with a missing vote or seat share are left out of the sum (na.rm = TRUE).
cses3.disprop$sumdiff <- rowSums(cses3.disprop[, 19:27], na.rm = TRUE)
cses3.disprop$gallagher <- sqrt(cses3.disprop$sumdiff / 2)
cses3.gallagher <- cses3.disprop$gallagher

## Calculate mean district magnitude

## Read C5063, C5066, C5069 and C5075 from the fixed-width file; the fields named "NULL" are only skipped over and dropped
cses3.mdm <- read.fwf("cses3.dat", widths=c(1070,3,7,3,7,3,14,3), col.names=c("NULL", "C5063", "NULL", "C5066", "NULL", "C5069", "NULL", "C5075"))
cses3.mdm <- cses3.mdm[,c(2,4,6,8)]

## Set the codes treated as missing to NA (999 for C5063 and C5075; 997 and above for C5066 and C5069)
cses3.mdm$C5063[cses3.mdm$C5063==999] <- NA
cses3.mdm$C5066[cses3.mdm$C5066>=997] <- NA
cses3.mdm$C5069[cses3.mdm$C5069>=997] <- NA
cses3.mdm$C5075[cses3.mdm$C5075==999] <- NA

## mdm = C5075 / (C5063 + C5066 + C5069), where missing terms are dropped from the denominator.
## Computed for all rows at once instead of one apply() call per respondent; the values are the same.
## NOTE(review): when all three denominator variables are missing the denominator is 0, so mdm is Inf (or NA/NaN) - confirm this is acceptable.
cses3.mdm$mdm <- cses3.mdm$C5075 / unname(rowSums(cses3.mdm[, 1:3], na.rm = TRUE))

## Generate variable for type of election (a 2-character field read after skipping the first 172 characters of each record)
cses3.type <- read.fwf("cses3.dat", widths = c(172, 2), col.names = c("VAR.NULL", "type"))

## Get weights: sample, demographic and political weights follow the first 83 characters of each record
cses3.weights <- read.fwf("cses3.dat", widths = c(83, 10, 7, 6),
                          col.names = c("VAR.NULL", "sampleweight", "demoweight", "polweight"))
cses3.weights <- cses3.weights[, c("sampleweight", "demoweight", "polweight")]

## Where one weight already incorporates another, the incorporated weight is set to 1 for that country.
## Rows of cses3.weights are matched to rows of cses3.id by position.

# Croatia: political weights include demographic weights
cses3.weights[cses3.id$country == "Croatia", "demoweight"] <- 1

# France: political weights seem to include other weights
cses3.weights[cses3.id$country == "France", c("sampleweight", "demoweight")] <- 1

# Germany: sampling weights include demographic weights
cses3.weights[cses3.id$country == "Germany", "demoweight"] <- 1

# Netherlands and New Zealand: political weights include demographic weights
for (weight.country in c("Netherlands", "New Zealand")) {
  cses3.weights$demoweight[cses3.id$country == weight.country] <- 1
}

# Switzerland: political weights include sample weights
cses3.weights$sampleweight[cses3.id$country == "Switzerland"] <- 1

# Party voted for
## vote1 and vote2 are two 2-character fields read after skipping the first 425 characters of each record.
## Values of 89 or more are set to NA.
cses3.vote <- read.fwf("cses3.dat", widths = c(425, 2, 2), col.names = c("NULL", "vote1", "vote2"))
cses3.vote <- cses3.vote[, c("vote1", "vote2")]
for (vote.var in c("vote1", "vote2")) {
  cses3.vote[[vote.var]][cses3.vote[[vote.var]] >= 89] <- NA
}

# Find out whether voted
## C3021_1 is taken from the Stata version of the file and converted to numeric:
## 1 is coded 1, 2 is coded 0 and anything above 2 is set to NA.
cses3.tmp <- read.dta("cses3.dta")
cses3.voted <- as.numeric(cses3.tmp$C3021_1)
cses3.voted[cses3.voted > 2] <- NA
cses3.ind$voted <- ifelse(cses3.voted == 1, 1, 0)

## Get macro data on regime type and electoral system
## The variable regime is from Cheibub, Gandhi, and Vreeland (2010). electoralsystem and proportional are from Matt Golder.
## The first gives the type of electoral system (majoritarian, proportional or mixed).
## The second is a dummy coded 0 for majoritarian systems and 1 for proportional or mixed dependent systems.
golder.data <- read.csv("regimes.csv", header = TRUE)
for (key.var in c("country", "year")) {
  golder.data[[key.var]] <- as.character(golder.data[[key.var]])
}

## The following amendments to the Golder data fill in some missing data.
## Source: IDEA, Electoral System Design: the New International IDEA Handbook
idea.proportional <- c("Greece" = 1, "Korea" = 0, "Japan" = 0, "Taiwan" = 0, "Thailand" = 0)
for (idea.country in names(idea.proportional)) {
  golder.data$proportional[golder.data$country == idea.country] <- idea.proportional[[idea.country]]
}

## Cabinet data. These data consist of three sets of variables created by Eric Guntermann (based on earlier work by Marc-André Bodet).
## incabinet1 to incabinet9 are dummies coded 1 if parties 1 to 9 are in cabinet and 0 if they are not.
## propseats1 to propseats9 give the proportion of all cabinet seats held by a given party.
## proppseats1 to proppseats9 provide the proportion of all cabinet seats held by party members that are occupied by members of each party.
cabinet <- read.csv("cabinet.csv")
cabinet$countryyear <- as.character(cabinet$countryyear)
## Rename totalseats.1 to totalseatsp (the renamed column ends up as the last column)
cabinet$totalseatsp <- cabinet$totalseats.1
cabinet$totalseats.1 <- NULL

## Freedom house data
fh <- read.csv("fhdata.csv")

## World Bank data
wb <- read.csv("wbdata.csv")

## Bring together modules 1 to 3

# CSES1
## Combine the pieces extracted from module 1 column-wise (rows are matched by position)
cses1 <- data.frame(cses1.id, cses1.vars, cses1.ind, cses1.type, gallagher=cses1.gallagher, cses1.weights, cses1.disprop[, 7:12], mdm=cses1.mdm$mdm, cses1.vote)

## Create variables for parties 7 to 9 with values NA so CSES1 has the same variables as CSES2 and CSES3
for (party in 7:9) {
  cses1[[paste0("seats.party", party)]] <- NA
}

## Add the macro data. merge() keeps only rows with a match in both data frames.
cses1 <- merge(cses1, cabinet, by = "countryyear")
cses1 <- merge(cses1, golder.data, by = c("country", "year"))
cses1 <- merge(cses1, fh[, 2:3], by = "countryyear")
cses1 <- merge(cses1, wb[, 2:3], by = "countryyear")

## Only keep legislative elections (election type, converted to numeric, below 4)
cses1.legislative <- as.numeric(cses1$cses1.type) < 4
cses1 <- cses1[cses1.legislative, ]
cses1$cses1.type <- NULL

## Move variables around
cses1.cols <- c("country", "year", "country.num", "respondent", "countryyear", "education", "income",
                paste0("polinfo", 1:3), "voted", paste0("likedislike", 1:9), "gallagher",
                paste0("incabinet", 1:9), paste0("propseats", 1:9), "totalseats",
                paste0("proppseats", 1:9), "totalseatsp", paste0("seats.party", 1:9),
                "regime", "electoralsystem", "proportional", "sampleweight", "demoweight", "polweight",
                "mdm", "vote1", "vote2", "freedomhouse", "gdppercap")
cses1 <- cses1[, cses1.cols]

## Convert all identifiers (the first five columns) to character
cses1[1:5] <- lapply(cses1[1:5], as.character)

## Create module identifier
cses1$module <- 1

# CSES2

## Combine the pieces extracted from module 2 column-wise (rows are matched by position)
cses2 <- data.frame(cses2.id, cses2.ind, cses2.vars, gallagher=cses2.gallagher, cses2.type, cses2.weights, cses2.disprop[, 10:18], mdm=cses2.mdm$mdm, cses2.vote)

## Add the macro data. merge() keeps only rows with a match in both data frames.
cses2 <- merge(cses2, golder.data, by = c("country", "year"))
cses2 <- merge(cses2, cabinet, by = "countryyear")
cses2 <- merge(cses2, fh[, 2:3], by = "countryyear")
cses2 <- merge(cses2, wb[, 2:3], by = "countryyear")

## Only keep legislative elections (election type below 20)
cses2.legislative <- cses2$cses2.type < 20
cses2 <- cses2[cses2.legislative, ]
cses2$cses2.type <- NULL

## Move variables around
cses2.cols <- c("country", "year", "country.num", "respondent", "countryyear", "education", "income",
                paste0("polinfo", 1:3), "voted", paste0("likedislike", 1:9), "gallagher",
                paste0("incabinet", 1:9), paste0("propseats", 1:9), "totalseats",
                paste0("proppseats", 1:9), "totalseatsp", paste0("seats.party", 1:9),
                "regime", "electoralsystem", "proportional", "sampleweight", "demoweight", "polweight",
                "mdm", "vote1", "vote2", "freedomhouse", "gdppercap")
cses2 <- cses2[, cses2.cols]

## Create module identifier
cses2$module <- 2

## Convert all identifiers (the first five columns) to character
cses2[1:5] <- lapply(cses2[1:5], as.character)


# CSES3

## The merge keys must be character to match the keys of the macro data
for (key.var in c("country", "year")) {
  cses3.id[[key.var]] <- as.character(cses3.id[[key.var]])
}

## Combine the pieces extracted from module 3 column-wise (rows are matched by position)
cses3 <- data.frame(cses3.id, cses3.ind, cses3.vars, gallagher=cses3.gallagher, type=cses3.type$type, cses3.weights, cses3.disprop[, 10:18], mdm=cses3.mdm$mdm, cses3.vote)

## Add the macro data. merge() keeps only rows with a match in both data frames.
cses3 <- merge(cses3, golder.data, by = c("country", "year"))
cses3 <- merge(cses3, cabinet, by = "countryyear")
cses3 <- merge(cses3, fh[, 2:3], by = "countryyear")
cses3 <- merge(cses3, wb[, 2:3], by = "countryyear")

## Only keep legislative elections (election type below 20)
cses3.legislative <- cses3$type < 20
cses3 <- cses3[cses3.legislative, ]
cses3$type <- NULL

## Move variables around. Unlike for CSES1 and CSES2, mdm comes right after gallagher here;
## rbind() on data frames matches columns by name, so the different position is harmless.
cses3.cols <- c("country", "year", "country.num", "respondent", "countryyear", "education", "income",
                paste0("polinfo", 1:3), "voted", paste0("likedislike", 1:9), "gallagher", "mdm",
                paste0("incabinet", 1:9), paste0("propseats", 1:9), "totalseats",
                paste0("proppseats", 1:9), "totalseatsp", paste0("seats.party", 1:9),
                "regime", "electoralsystem", "proportional", "sampleweight", "demoweight", "polweight",
                "vote1", "vote2", "freedomhouse", "gdppercap")
cses3 <- cses3[, cses3.cols]

## Create module identifier
cses3$module <- 3

## Convert all identifiers (the first five columns) to character
cses3[1:5] <- lapply(cses3[1:5], as.character)


## Merge CSES1, CSES2, CSES3 (stack the three modules; columns are matched by name)

cses <- rbind(cses1, cses2, cses3)

## Remove Japan 2004 and Japan 2007, both of which were elections to the House of Councillors (the upper chamber)
upper.house <- cses$countryyear == "Japan 2004" | cses$countryyear == "Japan 2007"
cses <- cses[!upper.house, ]
## Remove Ukraine 1998, since the prime minister was non-partisan.
nonpartisan.pm <- cses$countryyear == "Ukraine 1998"
cses <- cses[!nonpartisan.pm, ]


## CSES4

## Load dataset
library(readstata13)
cses4 <- read.dta13("cses4.dta")

## Create identification variables. These are numeric country code, country name in English starting with a capital letter,
## year in four digits, numeric respondent identifier, and country-year (combining country name and year).
cses4.id <- setNames(cses4[, c("D1006", "D1008", "D1009")], c("country.num", "year", "respondent"))

## Country names are assigned to the levels of the country code by position,
## so the names below must be listed in the order of those levels.
cses4.id$country <- as.factor(cses4.id$country.num)
cses4.id$year <- as.character(cses4.id$year)
cses4.id$respondent <- as.character(cses4.id$respondent)
levels(cses4.id$country) <- c("Australia", "Austria", "Taiwan", "France", "Germany", "Greece", "Iceland", "Ireland", "Japan",
                              "Mexico", "Montenegro", "New Zealand", "Poland", "Serbia", "Switzerland", "Thailand", "United States")
cses4.id$country <- as.character(cses4.id$country)
cses4.id$countryyear <- paste(cses4.id$country, cses4.id$year)

## Drop the numeric country code
cses4.id <- cses4.id[, c("year", "respondent", "country", "countryyear")]

## Generate variable for type of election
cses4.type <- cses4[["D1015"]]

# Party like-dislike scales. These are taken from CSES variables D3011_A, D3011_B, ... D3011_I (columns 112 to 120 of the dataset)
cses4.vars <- cses4[, 112:120]

## Values above 10 are set to NA
for (scale.var in paste0("D3011_", LETTERS[1:9])) {
  cses4.vars[[scale.var]][cses4.vars[[scale.var]] > 10] <- NA
}
colnames(cses4.vars) <- paste0("likedislike", 1:9)

## Individual-level data on political information, education, and income
cses4.ind <- cses4[, c("D2003", "D2020", "D3025_1_A", "D3025_2_A", "D3025_3_A", "D3025_4_A", "D3005_LH")]
colnames(cses4.ind) <- c("education", "income", paste0("polinfo", 1:4), "voted")

## Income: values above 5 are set to NA
cses4.ind$income <- as.numeric(cses4.ind$income)
cses4.ind$income[cses4.ind$income > 5] <- NA

## Education: values above 10 are set to NA and the value 10 is recoded to 0
cses4.ind$education <- as.numeric(cses4.ind$education)
cses4.ind$education[cses4.ind$education > 10] <- NA
cses4.ind$education[cses4.ind$education == 10] <- 0

## Political information items: the value 5 is set to NA
for (info.var in paste0("polinfo", 1:4)) {
  cses4.ind[[info.var]] <- as.numeric(cses4.ind[[info.var]])
  cses4.ind[[info.var]][cses4.ind[[info.var]] == 5] <- NA
}

## Turnout: 1 is coded 1, 2 is coded 0 and anything above 2 is set to NA
cses4.voted <- as.numeric(cses4.ind$voted)
cses4.voted[cses4.voted > 2] <- NA
cses4.ind$voted <- ifelse(cses4.voted == 1, 1, 0)


# Remove unemployment rate question (polinfo2); the remaining items keep the names polinfo1, polinfo3 and polinfo4
cses4.ind$polinfo2 <- NULL
colnames(cses4.ind) <- c("education", "income", "polinfo1", "polinfo3", "polinfo4", "voted")

## Gallagher disproportionality index using the vote percentage of each party in the lower house (variables D5001_A to D5001_I in CSES)
## and their seat percentage (variables D5002_A to D5002_I in CSES)
## Columns 1 to 9 hold the vote shares and columns 10 to 18 the seat shares of parties 1 to 9.
cses4.disprop <- cses4[, c(paste0("D5001_", LETTERS[1:9]), paste0("D5002_", LETTERS[1:9]))]
colnames(cses4.disprop) <- c(paste0("votes.party", 1:9), paste0("seats.party", 1:9))

## The code 999 is set to NA in every vote- and seat-share variable
for (share.var in colnames(cses4.disprop)) {
  cses4.disprop[[share.var]][cses4.disprop[[share.var]] == 999] <- NA
}

# Add data for Japan
## Overwrite the vote shares of parties 1 to 9 for the respondents of Japan 2013 (party 9 is set to missing).
## NOTE(review): only vote shares are overwritten; the seat shares for Japan 2013 keep whatever the CSES file provides - confirm this is intended.
## The recode of 999 to NA in seats.party1 to seats.party9 is not repeated here: it is already applied when
## cses4.disprop is built, and the seat-share columns are not modified between that point and this one.
japan.2013 <- cses4.id$countryyear == "Japan 2013"

cses4.disprop$votes.party1[japan.2013] <- 34.68 # Liberal Democratic Party
cses4.disprop$votes.party2[japan.2013] <- 13.4 # Democratic Party of Japan
cses4.disprop$votes.party3[japan.2013] <- 9.68 # Japanese Communist Party
cses4.disprop$votes.party4[japan.2013] <- 8.93 # Your Party
cses4.disprop$votes.party5[japan.2013] <- 11.94 # Japan Restoration Party
cses4.disprop$votes.party6[japan.2013] <- 14.22 # New Komeito
cses4.disprop$votes.party7[japan.2013] <- 1.77 # People's Life Party
cses4.disprop$votes.party8[japan.2013] <- 2.36 # Social Democratic Party
cses4.disprop$votes.party9[japan.2013] <- NA # Green Wind



## Gallagher (least-squares) disproportionality index: sqrt(0.5 * sum over parties of (vote share - seat share)^2)
## Columns 1 to 9 of cses4.disprop are the vote shares and columns 10 to 18 the seat shares of parties 1 to 9;
## the squared difference for party i is stored in column i + 18.
for (i in 1:9) {
  cses4.disprop[, i + 18] <- (cses4.disprop[, i] - cses4.disprop[, i + 9])^2
}

## Sum the squared differences only (columns 19 to 27). The range previously started at column 18, which added
## the raw seat share of party 9 to the sum whenever that share was not missing.
## Parties with a missing vote or seat share are left out of the sum (na.rm = TRUE).
cses4.disprop$sumdiff <- rowSums(cses4.disprop[, 19:27], na.rm = TRUE)
cses4.disprop$gallagher <- sqrt(cses4.disprop$sumdiff / 2)
cses4.gallagher <- cses4.disprop$gallagher

## Calculate mean district magnitude

cses4.mdm <- cses4[,c("D5063", "D5066", "D5069", "D5075")]

## Set the codes treated as missing to NA (999 for D5063 and D5075; 997 and above for D5066 and D5069)
cses4.mdm$D5063[cses4.mdm$D5063==999] <- NA
cses4.mdm$D5066[cses4.mdm$D5066>=997] <- NA
cses4.mdm$D5069[cses4.mdm$D5069>=997] <- NA
cses4.mdm$D5075[cses4.mdm$D5075==999] <- NA

## mdm = D5075 / (D5063 + D5066 + D5069), where missing terms are dropped from the denominator.
## Computed for all rows at once instead of one apply() call per respondent; the values are the same.
## NOTE(review): when all three denominator variables are missing the denominator is 0, so mdm is Inf (or NA/NaN) - confirm this is acceptable.
cses4.mdm$mdm <- cses4.mdm$D5075 / unname(rowSums(cses4.mdm[, 1:3], na.rm = TRUE))

## Get weights (D1010_1, D1010_2 and D1010_3 become the sample, demographic and political weights)
cses4.weights <- setNames(cses4[, c("D1010_1", "D1010_2", "D1010_3")], c("sampleweight", "demoweight", "polweight"))

## Where one weight already incorporates another, the incorporated weight is set to 1 for that country.

# New Zealand: political weights include demographic weights
cses4.weights[cses4.id$country == "New Zealand", "demoweight"] <- 1

# Serbia: demographic weights include sampling weights
cses4.weights[cses4.id$country == "Serbia", "sampleweight"] <- 1

# Switzerland: political weights include sampling weights
cses4.weights[cses4.id$country == "Switzerland", "sampleweight"] <- 1

# Vote choice (D3006_LH_PL and D3006_LH_DC); values above 93 are set to NA

cses4.vote <- setNames(cses4[, c("D3006_LH_PL", "D3006_LH_DC")], c("vote1", "vote2"))
cses4.vote[cses4.vote > 93] <- NA

## Get macro data on regime type and electoral system
## The variable regime is from Cheibub, Gandhi, and Vreeland (2010). electoralsystem and proportional are from Matt Golder.
## The first gives the type of electoral system (majoritarian, proportional or mixed).
## The second is a dummy coded 0 for majoritarian systems and 1 for proportional or mixed dependent systems.
## Note: from here on golder.data, fh, wb and cabinet hold the module 4 versions of the macro data.
golder.data <- read.csv("regimes2.csv", header = TRUE)
for (key.var in c("country", "year")) {
  golder.data[[key.var]] <- as.character(golder.data[[key.var]])
}

## Elections for which Freedom House and World Bank data are kept
cses4.elections <- c("Australia 2013", "Austria 2013", "Taiwan 2012", "France 2012",
                     "Germany 2013", "Greece 2012", "Iceland 2013", "Ireland 2011",
                     "Japan 2013", "Mexico 2012", "Montenegro 2012", "New Zealand 2011",
                     "Poland 2011", "Serbia 2012", "Switzerland 2011", "Thailand 2011",
                     "United States 2012")

## Freedom house data
fh <- read.csv("fhdata.csv")
fh <- fh[fh$countryyear %in% cses4.elections, ]
## World Bank data
wb <- read.csv("wbdata.csv")
wb <- wb[wb$countryyear %in% cses4.elections, ]
# Cabinet data
cabinet <- read.csv("cabinet_cses4.csv")
cabinet$countryyear <- as.character(cabinet$countryyear)
## Rename totalseats.1 to totalseatsp (the renamed column ends up as the last column)
cabinet$totalseatsp <- cabinet$totalseats.1
cabinet$totalseats.1 <- NULL

# Note that Serbia 2012 was removed because there were too many missing parties


# Add CSES 4 to other modules
library(foreign)
## Combine the pieces extracted from module 4 column-wise (rows are matched by position)
cses4 <- data.frame(cses4.id, cses4.ind, cses4.vars, gallagher=cses4.gallagher, type=cses4.type, cses4.weights, cses4.disprop[, 10:18], mdm=cses4.mdm$mdm, cses4.vote)

## Only keep legislative elections (election type, converted to numeric, below 4)
cses4 <- cses4[as.numeric(cses4.type) < 4, ]
cses4$type <- NULL

## Add the macro data. merge() keeps only rows with a match in both data frames and puts the key column(s) first,
## which determines the column order that the later renaming of cses4 relies on.
cses4 <- merge(cses4, golder.data, by = c("country", "year"))
for (macro.data in list(cabinet, fh[, 2:3], wb[, 2:3])) {
  cses4 <- merge(cses4, macro.data, by = "countryyear")
}



# Add module number
cses4$module <- 4


## Move variables around
## Column order shared by the pooled modules 1 to 3 and by module 4 (the numeric country code is dropped)
final.cols <- c("country", "year", "respondent", "countryyear", "education", "income",
                paste0("polinfo", 1:3), "voted", paste0("likedislike", 1:9), "gallagher",
                paste0("incabinet", 1:9), paste0("propseats", 1:9), "totalseats",
                paste0("proppseats", 1:9), "totalseatsp", paste0("seats.party", 1:9),
                "regime", "electoralsystem", "proportional", "sampleweight", "demoweight", "polweight",
                "mdm", "vote1", "vote2", "freedomhouse", "gdppercap", "module")
cses <- cses[, final.cols]

## Rename the columns of cses4 by position. Among other things this renames the two political information items
## that follow polinfo1 to polinfo2 and polinfo3.
## NOTE(review): this relies on cses4 having exactly these 70 columns in exactly this order after the merges - confirm
## against the columns of regimes2.csv and cabinet_cses4.csv.
colnames(cses4) <- c("countryyear", "country", "year", "respondent", "education", "income",
                     paste0("polinfo", 1:3), "voted", paste0("likedislike", 1:9), "gallagher",
                     "sampleweight", "demoweight", "polweight", paste0("seats.party", 1:9),
                     "mdm", "vote1", "vote2", "regime", "electoralsystem", "proportional",
                     paste0("incabinet", 1:9), paste0("propseats", 1:9), "totalseats",
                     paste0("proppseats", 1:9), "totalseatsp", "freedomhouse", "gdppercap", "module")
cses4 <- cses4[, final.cols]

## Stack modules 1 to 3 and module 4, then save the result
cses1_4 <- rbind(cses, cses4)

save(cses1_4, file = "cses.Rda")

